#!/usr/bin/env bash

# ANSI colour escape sequences for terminal messages; they are stored as
# literal backslash sequences, so they only take effect once expanded inside
# a printf format string
NC="\033[0m"
RED="\033[0;31m"
YELLOW="\033[0;33m"
GREEN="\033[0;32m"

## help info ##
# shown when the program is called with no arguments, or with "-h" or "help"
# as the first argument

# prints the help menu to stdout
print_help() {

    printf "\n --------------------------------  HELP INFO  --------------------------------- \n\n"
    printf "  This program downloads assembly files for NCBI genomes. It takes as input\n"
    printf "  assembly accessions (either GCA_* or GCF_*) and optionally a specification of\n"
    printf "  which format to download. For version info, run \`bit-version\`.\n\n"

    printf "    Required input:\n\n"
    printf "        - [-w <file>] single-column file of NCBI assembly accessions\n\n"

    printf "    Optional arguments include:\n\n"

    printf "        - [-f <str>] default: genbank\n"
    printf "                  Specify the desired format. Available options currently\n"
    printf "                  include: genbank, fasta, protein, nt_cds, gff, feature_tab, report, stats.\n\n"

    printf "        - [-j <int> ] default: 1\n"
    printf "                  The number of downloads you'd like to run in parallel. Write speeds\n"
    printf "                  can become problematic generally with around 15 or more, so a max\n"
    printf "                  of 12 will be used to be safe even if more are requested.\n\n"

    printf "    Example usage:\n\n\t bit-dl-ncbi-assemblies -w ncbi_accessions.txt -f protein -j 4\n\n"

}

# "$1" is quoted inside [[ ]] so that an empty first argument, or one holding
# spaces, is compared as a plain string instead of breaking the test
if [[ "$#" -eq 0 || "$1" == "-h" || "$1" == "help" ]]; then
    print_help
    exit 0
fi

printf "\n"

## setting default ##
format="genbank"
num_jobs=1

## parsing arguments
# parse_cli_args <command-line arguments...>
# Reads the -w, -f and -j options and stores their values in the global
# variables NCBI_acc_file, format and num_jobs respectively.
# An unknown option, or an option given without its value, prints a message
# to stderr and ends the program with exit status 1.
parse_cli_args() {

    # OPTIND is kept local so every call starts scanning at the first argument
    local OPTIND=1
    local opt

    while getopts ":w:f:j:" opt; do
        case "${opt}" in
            w)
                NCBI_acc_file="${OPTARG}"
                ;;
            f)
                format="${OPTARG}"
                ;;
            j)
                num_jobs="${OPTARG}"
                ;;
            :)
                # with a leading ":" in the option string, getopts reports an option that
                # is missing its value here, with the option letter held in OPTARG
                printf "\n  ${RED}Option -%s requires an argument.${NC}\n\n    Run 'bit-dl-ncbi-assemblies' with no arguments or '-h' only to see help menu.\n\n" "${OPTARG}" >&2
                exit 1
                ;;
            \?)
                printf "\n  ${RED}Invalid argument: -%s${NC}\n\n    Run 'bit-dl-ncbi-assemblies' with no arguments or '-h' only to see help menu.\n\n" "${OPTARG}" >&2
                exit 1
                ;;
        esac
    done

}

parse_cli_args "$@"

# setting max
# normalize_num_jobs <requested>
# Prints the number of parallel downloads to use for the requested value:
# the value itself without leading zeros, or 12 when more than 12 was asked for.
# Returns 1 (printing nothing) when the value is not a whole number of at least 1.
normalize_num_jobs() {

    local requested="$1"

    # digits only: rejects empty, negative, fractional and non-numeric values
    [[ "$requested" =~ ^[0-9]+$ ]] || return 1

    # dropping leading zeros so that e.g. "08" is not read as an octal number,
    # and so that "01" still equals "1" in later string comparisons
    requested="${requested#"${requested%%[!0]*}"}"

    # nothing left means the value was zero
    [[ -n "$requested" ]] || return 1

    # anything longer than two digits is necessarily above the cap; checking the
    # length first means very long values are never evaluated as a number
    if (( ${#requested} > 2 )) || (( requested > 12 )); then
        requested=12
    fi

    printf '%s\n' "$requested"

}

# falls back to the documented default of 1 only if num_jobs was never assigned
requested_jobs="${num_jobs-1}"

if ! num_jobs=$(normalize_num_jobs "$requested_jobs"); then
    printf "\n  ${RED}Invalid argument passed to '-j' option: %s${NC}\n\n" "$requested_jobs" >&2
    printf "  It needs to be a whole number of at least 1.\n" >&2
    printf "Exiting for now.\n\n" >&2
    exit 1
fi

## making sure input file was provided ##
# NCBI_acc_file only holds a value when the -w option was given; without it
# there is nothing to work on, so the problem is reported and the program
# stops with a failing exit status that calling scripts can detect
if [[ -z "${NCBI_acc_file:-}" ]]; then
    printf "\n  ${RED}You need to provide an input file with NCBI accessions!${NC}\n"
    printf "\nExiting for now.\n\n"
    exit 1
fi

## making sure format specified is interpretable

# is_valid_format <format>
# Succeeds only when <format> is exactly one of the download formats this
# program knows how to fetch.
is_valid_format() {
    case "$1" in
        genbank | fasta | protein | nt_cds | gff | feature_tab | report | stats)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

# falls back to the documented default only if format was never assigned;
# an explicitly empty value is left alone so that it is still rejected below
format="${format-genbank}"

if ! is_valid_format "$format"; then
    # the value is passed as a printf argument so that any "%" or "\" in it is printed literally
    printf "\n  ${RED}Invalid argument passed to '-f' option: %s\n\n${NC}" "$format"
    printf "  Valid options are genbank, fasta, protein, gff, nt_cds, feature_tab, report, or stats.\n"
    printf "Exiting for now.\n\n"
    exit 1
fi


## checking no duplicates in input file ##

# count_duplicate_lines <file>
# Prints how many distinct lines occur more than once in <file>.
# The lines are sorted first because uniq only compares neighbouring lines,
# so repeats that are not next to each other would otherwise go unnoticed.
count_duplicate_lines() {
    LC_ALL=C sort -- "$1" | LC_ALL=C uniq -d | wc -l | tr -d '[:space:]'
}

# a missing file is skipped here without comment
if [[ -f "$NCBI_acc_file" ]]; then
    num_dupes=$(count_duplicate_lines "$NCBI_acc_file")
    if [[ "$num_dupes" != "0" ]]; then
        printf "\n${RED}      %s has duplicate entries, check it out and provide unique accessions only.${NC}\n" "$NCBI_acc_file"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi


## making sure input file is there, and storing total number of targets ##
if [[ -f "$NCBI_acc_file" ]]; then
    # awk's record count includes a final line that has no trailing newline,
    # which "wc -l" would leave out; reading from stdin keeps the file name
    # (spaces, "=" and all) away from awk's own argument handling
    NCBI_input_genomes_total=$(awk 'END { print NR }' < "$NCBI_acc_file")
    printf "    Targeting %s genomes in %s format.\n\n" "$NCBI_input_genomes_total" "$format"
else
    printf "\n${RED}      You specified %s, but that file cannot be found :(${NC}\n" "$NCBI_acc_file"
    printf "\nExiting for now.\n\n"
    # failing exit status so calling scripts can tell nothing was downloaded
    exit 1
fi


# the list of accessions that could not be downloaded is only ever appended
# to, so any copy already sitting in the working directory is cleared first
rm -rf -- "NCBI-accessions-not-downloaded.txt"

## downloading assembly summaries (if not present in cwd already) ##

# download_summary_with_retries <url> <destination> <label>
# Fetches <url> into <destination>, overwriting it on every try so that the
# leftovers of a failed transfer never end up in the result.
# A failed fetch is retried up to 15 more times, pausing 3 seconds before each
# retry and announcing it using <label> (e.g. "GenBank").
# The download has been seen to fail on one try and work on the next for no
# identified reason, hence the retries on top of curl's own --retry.
# Returns 0 as soon as a fetch succeeds, 1 if every try failed.
download_summary_with_retries() {

    local url="$1" destination="$2" label="$3"
    local attempt=1

    until curl --connect-timeout 30 --retry 10 "$url" > "$destination"; do

        if (( attempt >= 16 )); then
            return 1
        fi

        # waiting a few seconds
        sleep 3

        # incrementing counter
        attempt=$((attempt + 1))

        printf "\n    ${YELLOW}Snag downloading %s assembly summary table, trying up to 15 times. Attempt: %s${NC}\n\n" "$label" "$attempt"

    done

    return 0

}

# abort_summary_download <scratch files...>
# Reports that the summaries could not be downloaded, removes the given
# scratch files together with any ncbi_assembly_info.tsv, and ends the
# program with exit status 1.
abort_summary_download() {
    printf "\n\n  ${RED}Download of NCBI assembly summaries failed :(${NC}\n  Is the internet connection weak?\n\nExiting for now.\n\n"
    rm -rf -- ncbi_assembly_info.tsv "$@"
    exit 1
}

if [[ ! -s ncbi_assembly_info.tsv ]]; then

    # each table is fetched into its own scratch file, and ncbi_assembly_info.tsv
    # is only put in place once both are complete; an interrupted run therefore
    # cannot leave behind a partial table that a later run would reuse
    genbank_part="ncbi_assembly_info.tsv.genbank.tmp"
    refseq_part="ncbi_assembly_info.tsv.refseq.tmp"

    printf "    ${GREEN}Downloading ncbi assembly summaries to be able to construct ftp links...${NC}\n\n"

    download_summary_with_retries "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" "$genbank_part" "GenBank" \
        || abort_summary_download "$genbank_part" "$refseq_part"

    printf "\n"

    download_summary_with_retries "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" "$refseq_part" "RefSeq" \
        || abort_summary_download "$genbank_part" "$refseq_part"

    # GenBank table first with the RefSeq table appended after it, then moved to its final name
    if ! { cat "$refseq_part" >> "$genbank_part" && mv -f -- "$genbank_part" ncbi_assembly_info.tsv; }; then
        abort_summary_download "$genbank_part" "$refseq_part"
    fi

    rm -f -- "$refseq_part"

    printf "\n    ${YELLOW}Assembly summary tables downloaded successfully. They are stored\n"
    printf "    in the current working directory as \"ncbi_assembly_info.tsv\".\n"
    printf "    That file is left here in case you'd like to use it again soon. But be\n"
    printf "    sure to delete it if you'd like. Now getting to work on our targets.${NC}\n\n"

fi


## parsing assembly summaries and generating base ftp link info tab ##
# the link table is written to a scratch file in the working directory, named after the current time
tmp_file="$(date +%s).bit-dl-ncbi.tmp"

# everything below relies on that table, so a failing helper stops the run
# here instead of carrying on without it
if ! helper-bit-parse-assembly-summary-file.py -a ncbi_assembly_info.tsv -w "$NCBI_acc_file" -o "$tmp_file"; then
    printf "\n  ${RED}Parsing the NCBI assembly summaries failed :(${NC}\n\nExiting for now.\n\n" >&2
    rm -f -- "$tmp_file"
    exit 1
fi

# checking which not found:
# lines of the input file that do not appear in the first column of the link table
comm -23 <(sort -- "$NCBI_acc_file") <(cut -f 1 -- "$tmp_file" | sort) > NCBI-accessions-not-found.tmp

# reporting
if [[ -s NCBI-accessions-not-found.tmp ]]; then

    mv NCBI-accessions-not-found.tmp NCBI-accessions-not-found.txt

    # the arithmetic expansion drops the padding some wc implementations put around the count
    num_not_found=$(( $(wc -l < NCBI-accessions-not-found.txt) ))
    remaining_targets=$((NCBI_input_genomes_total - num_not_found))

    printf "\n     ${RED}******************************* ${NC}NOTICE ${RED}*******************************${NC}  \n"
    printf "\t\t  %s input accessions were not found at NCBI.\n\n" "$num_not_found"
    printf "\t\t  Written to \"NCBI-accessions-not-found.txt\".\n"
    printf "     ${RED}********************************************************************** ${NC}\n\n"

    printf "\t  Remaining total targets: %s\n\n" "$remaining_targets"

else

    rm NCBI-accessions-not-found.tmp
    remaining_targets=$NCBI_input_genomes_total

fi

## setting proper extension for download ##
# ext    : suffix appended to the remote file name when building the download link
# my_ext : suffix given to the file saved locally
case "$format" in
    genbank)
        ext="_genomic.gbff.gz"
        my_ext=".gb.gz"
        ;;
    fasta)
        ext="_genomic.fna.gz"
        my_ext=".fa.gz"
        ;;
    protein)
        ext="_protein.faa.gz"
        my_ext=".faa.gz"
        ;;
    gff)
        ext="_genomic.gff.gz"
        my_ext=".gff.gz"
        ;;
    nt_cds)
        ext="_cds_from_genomic.fna.gz"
        my_ext="_cds_from_genomic.fna.gz"
        ;;
    feature_tab)
        ext="_feature_table.txt.gz"
        my_ext=".tsv.gz"
        ;;
    report)
        ext="_assembly_report.txt"
        my_ext="_assembly_report.txt"
        ;;
    stats)
        ext="_assembly_stats.txt"
        my_ext="_assembly_stats.txt"
        ;;
esac


### running downloads in serial, unless specified to run in parallel ###

# build_ncbi_ftp_base_link <assembly_accession> <assembly_name>
# Prints the directory link used when the link table holds no usable link
# for an assembly:
#   ftp://ftp.ncbi.nlm.nih.gov/genomes/all/<GCF|GCA>/<ddd>/<ddd>/<ddd>/<accession>_<name>
# where the three <ddd> parts are characters 1-3, 4-6 and 7-9 of the text
# that sits between the first and second underscore of the accession (or up
# to its end when there is no second underscore).
build_ncbi_ftp_base_link() {

    local accession="$1" asm_name="$2"
    local db_prefix="GCA"
    local digits

    # checking if GCF or GCA
    if [[ "$accession" == "GCF"* ]]; then
        db_prefix="GCF"
    fi

    digits="${accession#*_}"
    digits="${digits%%_*}"

    printf '%s/%s/%s/%s/%s/%s_%s\n' \
        "ftp://ftp.ncbi.nlm.nih.gov/genomes/all" \
        "$db_prefix" "${digits:0:3}" "${digits:3:3}" "${digits:6:3}" \
        "$accession" "$asm_name"

}

if [[ "$num_jobs" == "1" ]]; then

    # progress counter; started explicitly so that a "num" inherited from the
    # environment cannot skew the numbers printed
    num=0

    # each line of the link table is split on tabs into the curr_line array
    # NOTE(review): tab counts as IFS whitespace, so an empty column would be
    # skipped and shift the fields after it; presumably the table never holds
    # empty columns, confirm against the helper that writes it
    while IFS=$'\t' read -r -a curr_line; do

        assembly="${curr_line[0]}"
        # not used any further in this section
        downloaded_accession="${curr_line[1]}"
        num=$((num + 1))

        printf "\r\t  Numero %s of %s: %s" "$num" "$remaining_targets" "$assembly"

        # storing and building links
        base_link="${curr_line[8]}"

        # checking link was actually present (sometimes, very rarely, it is not there)
        # if not there, attempting to build ourselves
        if [[ -z "$base_link" || "$base_link" == "na" ]]; then

            ass_name="${curr_line[2]}"
            end_path="${assembly}_${ass_name}"
            base_link=$(build_ncbi_ftp_base_link "$assembly" "$ass_name")

        else

            end_path=$(basename "$base_link")

        fi

        out_file="${assembly}${my_ext}"

        # attempting to download file for assembly
        curl --silent --retry 10 -o "$out_file" "${base_link}/${end_path}${ext}"

        # grabbing this to check if " XML " is in there
            # when this was first written, trying to download a link that wasn't there would fail
            # now it can download an xml-formatted file saying the link wasn't found at NCBI
            # so this lets us check for that, and report and remove it if that's the case
        file_command_output=$(file "$out_file")

        if [[ ! -s "$out_file" || "$file_command_output" == *" XML "* || "$file_command_output" == *" XHTML "* ]]; then

            printf "\n     ${RED}******************************* ${NC}NOTICE ${RED}*******************************${NC}  \n"
            printf "\t    %s's %s file didn't download successfully.\n" "$assembly" "$format"
            printf "\t    That file type may not exist for this accession.\n\n"
            printf "\t    Written to \"NCBI-accessions-not-downloaded.txt\".\n"
            printf "     ${RED}********************************************************************** ${NC}\n\n"

            printf '%s\n' "$assembly" >> NCBI-accessions-not-downloaded.txt

            rm -rf -- "$out_file"

        fi

    done < "$tmp_file"

else
    # every line of the link table is handed to the helper script as its first
    # argument, with up to num_jobs of them running at the same time
    parallel -j "$num_jobs" helper-bit-dl-ncbi-assemblies-parallel.sh {} "$my_ext" "$ext" "$format" < "$tmp_file"
fi

# the link table is no longer needed once the downloads have been attempted
rm "${tmp_file}"

# overwriting the progress line with blanks, then reporting completion
printf "\r                                                               \n\t\t\t  ${GREEN}DONE!${NC}\n\n"
